In [27]:
from pandas_patch import *
In [30]:
%psource structure
In [11]:
def get_test_df_complete():
""" get the full test dataset from Lending Club open source database,
the purpose of this fuction is to be used in a demo ipython notebook """
import requests
from zipfile import ZipFile
from StringIO import StringIO
zip_to_download = "https://resources.lendingclub.com/LoanStats3b.csv.zip"
r = requests.get(zip_to_download)
zipfile = ZipFile(StringIO(r.content))
file_csv = zipfile.namelist()[0]
df = pd.read_csv(zipfile.open(file_csv), skiprows =[0], na_values = ['n/a','N/A',''],
parse_dates = ['issue_d','last_pymnt_d','next_pymnt_d','last_credit_pull_d'] )
zipfile.close()
df = df[:-2]
nb_row = float(len(df.index))
df['na_col'] = np.nan
df['constant_col'] = 'constant'
df['duplicated_column'] = df.id
df['many_missing_70'] = np.nan
df.loc[1:int(0.3*nb_row),'many_missing_70'] = 1
df['bad'] = 1
index_good = df['loan_status'].isin(['Fully Paid', 'Current','In Grace Period'])
df.loc[index_good,'bad'] = 0
return df
In [12]:
# ipython tips
# with psource you can see the source code of a function
%psource pandas_patch
?nacount # to get info about the functions and docs
In [13]:
#df = get_test_df_complete()
# because no wifi connection
df = get_test_df_complete()
/Users/efourrier/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1164: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
data = self._reader.read(nrows)
In [19]:
df.columns
Out[19]:
Index([u'id', u'member_id', u'loan_amnt', u'funded_amnt', u'funded_amnt_inv', u'term', u'int_rate', u'installment', u'grade', u'sub_grade', u'emp_title', u'emp_length', u'home_ownership', u'annual_inc', u'is_inc_v', u'issue_d', u'loan_status', u'pymnt_plan', u'url', u'desc', u'purpose', u'title', u'zip_code', u'addr_state', u'dti', u'delinq_2yrs', u'earliest_cr_line', u'inq_last_6mths', u'mths_since_last_delinq', u'mths_since_last_record', u'open_acc', u'pub_rec', u'revol_bal', u'revol_util', u'total_acc', u'initial_list_status', u'out_prncp', u'out_prncp_inv', u'total_pymnt', u'total_pymnt_inv', u'total_rec_prncp', u'total_rec_int', u'total_rec_late_fee', u'recoveries', u'collection_recovery_fee', u'last_pymnt_d', u'last_pymnt_amnt', u'next_pymnt_d', u'last_credit_pull_d', u'collections_12_mths_ex_med', u'mths_since_last_major_derog', u'policy_code', u'na_col', u'constant_col', u'duplicated_column', u'many_missing_70', u'bad'], dtype='object')
In [20]:
df.nrow()
Out[20]:
188123
In [21]:
df.ncol()
Out[21]:
57
In [22]:
df.dfnum() #identify numeric variables
Out[22]:
['member_id',
'loan_amnt',
'funded_amnt',
'funded_amnt_inv',
'installment',
'annual_inc',
'dti',
'delinq_2yrs',
'inq_last_6mths',
'mths_since_last_delinq',
'mths_since_last_record',
'open_acc',
'pub_rec',
'revol_bal',
'total_acc',
'out_prncp',
'out_prncp_inv',
'total_pymnt',
'total_pymnt_inv',
'total_rec_prncp',
'total_rec_int',
'total_rec_late_fee',
'recoveries',
'collection_recovery_fee',
'last_pymnt_amnt',
'collections_12_mths_ex_med',
'mths_since_last_major_derog',
'policy_code',
'na_col',
'many_missing_70',
'bad']
In [23]:
df.dfchar() # identify character variables
Out[23]:
['id',
'term',
'int_rate',
'grade',
'sub_grade',
'emp_title',
'emp_length',
'home_ownership',
'is_inc_v',
'loan_status',
'pymnt_plan',
'url',
'desc',
'purpose',
'title',
'zip_code',
'addr_state',
'earliest_cr_line',
'revol_util',
'initial_list_status',
'constant_col',
'duplicated_column']
In [24]:
timeit df.factors()
1 loops, best of 3: 1.98 s per loop
In [118]:
df.nacount(axis = 0) # counting the number of missing values per column
Out[118]:
Nanumber
Napercentage
id
0
0.000000
member_id
0
0.000000
loan_amnt
0
0.000000
funded_amnt
0
0.000000
funded_amnt_inv
0
0.000000
term
0
0.000000
int_rate
0
0.000000
installment
0
0.000000
grade
0
0.000000
sub_grade
0
0.000000
emp_title
17734
0.059915
emp_length
0
0.000000
home_ownership
0
0.000000
annual_inc
0
0.000000
is_inc_v
0
0.000000
issue_d
0
0.000000
loan_status
0
0.000000
pymnt_plan
0
0.000000
url
0
0.000000
desc
232074
0.784068
purpose
0
0.000000
title
5
0.000017
zip_code
0
0.000000
addr_state
0
0.000000
dti
0
0.000000
delinq_2yrs
0
0.000000
earliest_cr_line
0
0.000000
fico_range_low
0
0.000000
fico_range_high
0
0.000000
inq_last_6mths
0
0.000000
mths_since_last_delinq
156366
0.528287
mths_since_last_record
251069
0.848243
open_acc
0
0.000000
pub_rec
0
0.000000
revol_bal
0
0.000000
revol_util
162
0.000547
total_acc
0
0.000000
initial_list_status
0
0.000000
out_prncp
0
0.000000
out_prncp_inv
0
0.000000
total_pymnt
0
0.000000
total_pymnt_inv
0
0.000000
total_rec_prncp
0
0.000000
total_rec_int
0
0.000000
total_rec_late_fee
0
0.000000
recoveries
0
0.000000
collection_recovery_fee
0
0.000000
last_pymnt_d
186
0.000628
last_pymnt_amnt
0
0.000000
next_pymnt_d
38805
0.131104
last_credit_pull_d
25
0.000084
last_fico_range_high
0
0.000000
last_fico_range_low
0
0.000000
collections_12_mths_ex_med
0
0.000000
mths_since_last_major_derog
223698
0.755770
policy_code
0
0.000000
In [119]:
df.nacount(axis = 1) # count the number of missing values per rows
Out[119]:
Nanumber
Napercentage
0
4
0.071429
1
4
0.071429
2
2
0.035714
3
1
0.017857
4
2
0.035714
5
3
0.053571
6
2
0.035714
7
3
0.053571
8
4
0.071429
9
4
0.071429
10
2
0.035714
11
2
0.035714
12
4
0.071429
13
3
0.053571
14
3
0.053571
15
4
0.071429
16
2
0.035714
17
4
0.071429
18
4
0.071429
19
2
0.035714
20
2
0.035714
21
2
0.035714
22
4
0.071429
23
4
0.071429
24
2
0.035714
25
2
0.035714
26
2
0.035714
27
3
0.053571
28
4
0.071429
29
4
0.071429
...
...
...
295957
3
0.053571
295958
3
0.053571
295959
1
0.017857
295960
2
0.035714
295961
2
0.035714
295962
3
0.053571
295963
3
0.053571
295964
5
0.089286
295965
1
0.017857
295966
3
0.053571
295967
3
0.053571
295968
4
0.071429
295969
3
0.053571
295970
3
0.053571
295971
4
0.071429
295972
4
0.071429
295973
2
0.035714
295974
4
0.071429
295975
4
0.071429
295976
1
0.017857
295977
4
0.071429
295978
2
0.035714
295979
2
0.035714
295980
2
0.035714
295981
4
0.071429
295982
3
0.053571
295983
3
0.053571
295984
2
0.035714
295985
2
0.035714
295986
3
0.053571
295987 rows × 2 columns
In [96]:
df.constantcol() # find the constant columns
Out[96]:
['policy_code']
In [97]:
df.findupcol() # find the duplicate columns
Out[97]:
[]
In [138]:
timeit df.detectkey(pct = )
1 loops, best of 3: 1.37 s per loop
In [37]:
timeit df.apply(lambda x: len(pd.unique(x)))
1 loops, best of 3: 7.66 s per loop
In [38]:
timeit df.count_unique()
1 loops, best of 3: 7.77 s per loop
In [ ]:
In [20]:
df.manymissing(a = 0.7)
Out[20]:
Index([u'desc', u'mths_since_last_record', u'mths_since_last_major_derog'], dtype='object')
In [31]:
timeit df.structure()
1 loops, best of 3: 11.1 s per loop
In [22]:
df.psummary(dynamic = True)
there are 0 duplicated rows
the columns with more than 70.00% manymissing values:
['desc', 'mths_since_last_record', 'mths_since_last_major_derog']
the columns with less than 5.00% manymissing values are :
['title', 'revol_util', 'last_pymnt_d', 'last_credit_pull_d']
you should fill them with median or most common value
the detected keys of the dataset are:
['id', 'member_id', 'url']
the duplicated columns of the dataset are:
[]
the constant columns of the dataset are:
['policy_code']
the columns with nearzerovariance are:
['pymnt_plan', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med']
0.999999954602
0.9999995703
0.999997894256
0.999997750074
0.999995699105
0.990938422868
0.967614558757
0.951902875674
0.875725744027
the columns highly correlated to others to remove are:
['fico_range_low', 'funded_amnt', 'total_pymnt_inv', 'out_prncp_inv', 'funded_amnt_inv', 'id', 'total_pymnt', 'loan_amnt']
these columns contains big strings :
['emp_title', 'url', 'desc']
In [27]:
df.str
Out[27]:
id
member_id
loan_amnt
funded_amnt
funded_amnt_inv
term
int_rate
installment
grade
sub_grade
...
collection_recovery_fee
last_pymnt_d
last_pymnt_amnt
next_pymnt_d
last_credit_pull_d
last_fico_range_high
last_fico_range_low
collections_12_mths_ex_med
mths_since_last_major_derog
policy_code
0
29003494
31526675
28000
28000
28000
36 months
0.071
866.10
A
A3
...
0
14-Nov
866.10
14-Dec
14-Nov
719
715
0
NaN
1
1
28913233
31436395
3000
3000
3000
36 months
0.117
99.18
B
B4
...
0
14-Oct
99.18
14-Dec
14-Nov
689
685
0
NaN
1
2
27592013
30095159
18000
18000
18000
36 months
0.140
615.03
C
C3
...
0
14-Nov
615.03
14-Dec
14-Nov
699
695
0
27
1
3
28172444
30685592
15000
15000
15000
60 months
0.202
399.08
E
E3
...
0
14-Nov
399.08
14-Dec
14-Nov
664
660
0
70
1
4
20349337
22622030
23275
23275
23275
60 months
0.176
585.61
D
D4
...
0
14-Nov
585.61
15-Jan
14-Nov
709
705
0
57
1
5
27502482
30005666
21000
21000
21000
60 months
0.140
488.42
C
C3
...
0
14-Nov
488.42
14-Dec
14-Nov
744
740
0
NaN
1
6
27702335
30215519
7800
7800
7800
36 months
0.110
255.33
B
B3
...
0
14-Nov
255.33
14-Dec
14-Nov
674
670
0
43
1
7
28032412
30545550
13625
13625
13625
36 months
0.163
480.97
D
D2
...
0
14-Nov
480.97
14-Dec
14-Nov
719
715
0
NaN
1
8
28032523
30545669
17225
17225
17225
60 months
0.140
400.62
C
C3
...
0
14-Nov
400.62
14-Dec
14-Nov
759
755
0
NaN
1
9
28042338
30555469
8150
8150
8150
36 months
0.092
259.82
B
B1
...
0
14-Nov
259.82
14-Dec
14-Nov
769
765
0
NaN
1
10
28052496
30565639
2000
2000
2000
36 months
0.092
63.76
B
B1
...
0
14-Nov
63.76
14-Dec
14-Nov
709
705
0
25
1
11
28062557
30575706
25000
25000
25000
60 months
0.145
588.08
C
C4
...
0
14-Nov
588.08
14-Dec
14-Nov
709
705
0
69
1
12
28072219
30585351
20000
20000
20000
60 months
0.130
454.96
C
C1
...
0
14-Nov
454.96
14-Dec
14-Nov
749
745
0
NaN
1
13
28072334
30585477
5000
5000
5000
36 months
0.110
163.67
B
B3
...
0
14-Nov
163.67
14-Dec
14-Nov
754
750
0
NaN
1
14
28102226
30615365
9000
9000
9000
36 months
0.125
301.04
B
B5
...
0
14-Nov
301.04
14-Dec
14-Nov
704
700
0
NaN
1
15
28102242
30615381
8000
8000
8000
36 months
0.163
282.41
D
D2
...
0
14-Nov
282.41
14-Dec
14-Nov
709
705
0
NaN
1
16
28112391
30625533
18625
18625
18625
60 months
0.163
455.80
D
D2
...
0
14-Nov
455.80
14-Dec
14-Nov
739
735
0
30
1
17
28122281
30635409
6825
6825
6825
36 months
0.150
236.56
C
C5
...
0
14-Nov
236.56
14-Dec
14-Nov
684
680
0
NaN
1
18
28122316
30635450
7000
7000
7000
36 months
0.145
240.92
C
C4
...
0
14-Nov
240.92
14-Dec
14-Nov
669
665
0
NaN
1
19
28122319
30635453
15000
15000
15000
60 months
0.163
367.09
D
D2
...
0
14-Nov
367.09
14-Dec
14-Nov
724
720
0
46
1
20
28132525
30645679
8000
8000
8000
36 months
0.125
267.60
B
B5
...
0
14-Nov
267.60
14-Dec
14-Nov
719
715
0
16
1
21
28192471
30705604
20000
20000
20000
60 months
0.150
475.70
C
C5
...
0
14-Nov
475.70
14-Dec
14-Nov
714
710
0
41
1
22
28202142
30715268
6075
6075
6075
36 months
0.182
220.36
D
D5
...
0
14-Nov
220.36
14-Dec
14-Nov
689
685
0
NaN
1
23
28212255
30725390
20000
20000
20000
60 months
0.110
434.75
B
B3
...
0
14-Nov
434.75
14-Dec
14-Nov
714
710
0
NaN
1
24
27611626
30114751
6400
6400
6400
36 months
0.170
228.15
D
D3
...
0
14-Nov
228.15
14-Dec
14-Nov
669
665
0
NaN
1
25
28032263
30545390
6075
6075
6075
36 months
0.150
210.57
C
C5
...
0
14-Nov
210.57
14-Dec
14-Nov
674
670
0
74
1
26
28042070
30555187
16000
16000
16000
60 months
0.134
366.93
C
C2
...
0
14-Nov
366.93
14-Dec
14-Nov
699
695
0
62
1
27
28042197
30555320
20000
20000
20000
36 months
0.117
661.14
B
B4
...
0
14-Nov
661.14
14-Dec
14-Nov
674
670
0
NaN
1
28
28052043
30565161
21000
21000
20900
36 months
0.102
679.10
B
B2
...
0
14-Nov
679.10
14-Dec
14-Nov
754
750
0
NaN
1
29
28052390
30565524
4400
4400
4400
36 months
0.163
155.33
D
D2
...
0
14-Nov
155.33
14-Dec
14-Nov
764
760
0
NaN
1
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
295957
2381629
2844317
10000
10000
10000
36 months
0.141
342.22
B
B5
...
0
14-Sep
5307.47
NaN
14-Sep
704
700
0
25
1
295958
2371607
2834360
28000
28000
28000
36 months
0.089
889.09
A
A5
...
0
14-Nov
889.09
14-Dec
14-Nov
734
730
0
NaN
1
295959
2381552
2844239
13975
13975
13975
36 months
0.178
503.62
D
D1
...
0
14-Oct
503.62
14-Dec
14-Nov
554
550
0
25
1
295960
2371355
2834090
10575
10575
10575
36 months
0.153
368.20
C
C2
...
0
14-Nov
368.20
14-Dec
14-Nov
589
585
0
NaN
1
295961
2381291
2843953
10000
10000
10000
36 months
0.102
323.43
B
B1
...
0
14-Nov
323.43
14-Dec
14-Nov
714
710
0
80
1
295962
1867268
2169638
24600
24600
24575
60 months
0.102
524.62
B
B1
...
0
14-Nov
524.62
14-Dec
14-Nov
759
755
0
NaN
1
295963
2380286
2842889
8500
8500
8500
36 months
0.060
258.71
A
A1
...
0
14-Nov
258.71
14-Dec
14-Nov
814
810
0
NaN
1
295964
2369570
2832161
16000
16000
16000
60 months
0.089
331.36
A
A5
...
0
14-Jul
174.92
NaN
14-Nov
794
790
0
NaN
1
295965
2380722
2843349
20000
20000
20000
60 months
0.131
456.19
B
B4
...
0
14-Nov
456.19
14-Dec
14-Nov
744
740
0
31
1
295966
2380728
2843355
18225
18225
18225
36 months
0.089
578.71
A
A5
...
0
14-Nov
578.71
14-Dec
14-Nov
729
725
0
NaN
1
295967
2370680
2833360
6000
6000
6000
36 months
0.102
194.06
B
B1
...
0
14-May
194.06
NaN
14-Oct
529
525
0
NaN
1
295968
2380321
2842924
16750
16750
16750
36 months
0.102
541.74
B
B1
...
0
14-Nov
541.74
14-Dec
14-Nov
714
710
0
NaN
1
295969
2370278
2832933
12000
12000
12000
36 months
0.102
388.11
B
B1
...
0
14-Nov
388.11
14-Dec
14-Nov
729
725
0
NaN
1
295970
2366346
2828664
19750
19750
19750
60 months
0.230
556.20
F
F1
...
0
14-Nov
15488.41
NaN
14-Nov
714
710
0
NaN
1
295971
2308648
2741109
20000
20000
20000
36 months
0.089
635.07
A
A5
...
0
14-Nov
635.07
14-Dec
14-Nov
724
720
0
NaN
1
295972
2379355
2841866
6000
6000
6000
36 months
0.205
224.49
E
E1
...
0
14-Aug
3517.06
NaN
14-Aug
684
680
0
NaN
1
295973
2301746
2734342
15000
15000
15000
36 months
0.141
513.33
B
B5
...
0
14-Nov
513.33
14-Dec
14-Nov
679
675
0
49
1
295974
2377558
2839922
18775
18775
18775
36 months
0.102
607.23
B
B1
...
0
14-Nov
607.23
14-Dec
14-Nov
714
710
0
NaN
1
295975
2377252
2839601
15000
15000
15000
36 months
0.111
492.08
B
B2
...
0
14-Nov
492.08
14-Dec
14-Nov
614
610
0
NaN
1
295976
2366519
2828844
35000
35000
34900
60 months
0.205
936.86
E
E1
...
0
14-Nov
936.86
14-Dec
14-Nov
699
695
0
34
1
295977
2367122
2829498
24000
24000
24000
36 months
0.066
736.89
A
A2
...
0
14-Nov
736.89
14-Dec
14-Nov
699
695
0
NaN
1
295978
2298828
2731287
23000
23000
23000
36 months
0.141
787.10
B
B5
...
0
14-Nov
787.10
14-Dec
14-Nov
699
695
0
NaN
1
295979
2375433
2837641
25000
25000
24950
36 months
0.131
843.68
B
B4
...
0
14-Nov
843.68
14-Dec
14-Nov
694
690
0
52
1
295980
2365716
2827951
21000
21000
21000
36 months
0.089
666.82
A
A5
...
0
14-Nov
666.82
14-Dec
14-Nov
734
730
0
6
1
295981
2375143
2837335
11500
11500
11450
36 months
0.111
377.26
B
B2
...
0
14-Nov
377.26
14-Dec
14-Nov
694
690
0
NaN
1
295982
2334898
2777049
16000
16000
15750
60 months
0.102
341.22
B
B1
...
0
14-Nov
341.22
14-Dec
14-Nov
729
725
0
NaN
1
295983
2375068
2837252
18000
18000
17950
36 months
0.188
657.54
D
D3
...
0
14-Nov
657.54
14-Dec
14-Nov
644
640
0
NaN
1
295984
2374791
2836963
17000
17000
17000
36 months
0.121
565.62
B
B3
...
0
14-Nov
565.62
14-Dec
14-Nov
674
670
0
22
1
295985
2301035
2733598
31500
31500
31500
36 months
0.121
1048.06
B
B3
...
0
14-Nov
1048.06
14-Dec
14-Nov
724
720
0
45
1
295986
2300581
2733123
28000
28000
27750
36 months
0.079
876.13
A
A4
...
0
14-Nov
876.13
14-Dec
14-Nov
754
750
0
NaN
1
295987 rows × 56 columns
In [48]:
%psource structure
In [49]:
df.nacount(axis = 0).Napercentage
Out[49]:
id 0.000000
member_id 0.000000
loan_amnt 0.000000
funded_amnt 0.000000
funded_amnt_inv 0.000000
term 0.000000
int_rate 0.000000
installment 0.000000
grade 0.000000
sub_grade 0.000000
emp_title 0.059915
emp_length 0.000000
home_ownership 0.000000
annual_inc 0.000000
is_inc_v 0.000000
issue_d 0.000000
loan_status 0.000000
pymnt_plan 0.000000
url 0.000000
desc 0.784068
purpose 0.000000
title 0.000017
zip_code 0.000000
addr_state 0.000000
dti 0.000000
delinq_2yrs 0.000000
earliest_cr_line 0.000000
fico_range_low 0.000000
fico_range_high 0.000000
inq_last_6mths 0.000000
mths_since_last_delinq 0.528287
mths_since_last_record 0.848243
open_acc 0.000000
pub_rec 0.000000
revol_bal 0.000000
revol_util 0.000547
total_acc 0.000000
initial_list_status 0.000000
out_prncp 0.000000
out_prncp_inv 0.000000
total_pymnt 0.000000
total_pymnt_inv 0.000000
total_rec_prncp 0.000000
total_rec_int 0.000000
total_rec_late_fee 0.000000
recoveries 0.000000
collection_recovery_fee 0.000000
last_pymnt_d 0.000628
last_pymnt_amnt 0.000000
next_pymnt_d 0.131104
last_credit_pull_d 0.000084
last_fico_range_high 0.000000
last_fico_range_low 0.000000
collections_12_mths_ex_med 0.000000
mths_since_last_major_derog 0.755770
policy_code 0.000000
Name: Napercentage, Length: 56, dtype: float64
In [32]:
%timeit df.nacount()
1 loops, best of 3: 239 ms per loop
In [34]:
timeit df.count_unique()
1 loops, best of 3: 7.47 s per loop
In [67]:
df.count()
Out[67]:
id 295987
member_id 295987
loan_amnt 295987
funded_amnt 295987
funded_amnt_inv 295987
term 295987
int_rate 295987
installment 295987
grade 295987
sub_grade 295987
emp_title 278253
emp_length 295987
home_ownership 295987
annual_inc 295987
is_inc_v 295987
issue_d 295987
loan_status 295987
pymnt_plan 295987
url 295987
desc 63913
purpose 295987
title 295982
zip_code 295987
addr_state 295987
dti 295987
delinq_2yrs 295987
earliest_cr_line 295987
fico_range_low 295987
fico_range_high 295987
inq_last_6mths 295987
mths_since_last_delinq 139621
mths_since_last_record 44918
open_acc 295987
pub_rec 295987
revol_bal 295987
revol_util 295825
total_acc 295987
initial_list_status 295987
out_prncp 295987
out_prncp_inv 295987
total_pymnt 295987
total_pymnt_inv 295987
total_rec_prncp 295987
total_rec_int 295987
total_rec_late_fee 295987
recoveries 295987
collection_recovery_fee 295987
last_pymnt_d 295801
last_pymnt_amnt 295987
next_pymnt_d 257182
last_credit_pull_d 295962
last_fico_range_high 295987
last_fico_range_low 295987
collections_12_mths_ex_med 295987
mths_since_last_major_derog 72289
policy_code 295987
Length: 56, dtype: int64
In [68]:
df.nacount()
Out[68]:
Nanumber
Napercentage
id
0
0.000000
member_id
0
0.000000
loan_amnt
0
0.000000
funded_amnt
0
0.000000
funded_amnt_inv
0
0.000000
term
0
0.000000
int_rate
0
0.000000
installment
0
0.000000
grade
0
0.000000
sub_grade
0
0.000000
emp_title
17734
0.059915
emp_length
0
0.000000
home_ownership
0
0.000000
annual_inc
0
0.000000
is_inc_v
0
0.000000
issue_d
0
0.000000
loan_status
0
0.000000
pymnt_plan
0
0.000000
url
0
0.000000
desc
232074
0.784068
purpose
0
0.000000
title
5
0.000017
zip_code
0
0.000000
addr_state
0
0.000000
dti
0
0.000000
delinq_2yrs
0
0.000000
earliest_cr_line
0
0.000000
fico_range_low
0
0.000000
fico_range_high
0
0.000000
inq_last_6mths
0
0.000000
mths_since_last_delinq
156366
0.528287
mths_since_last_record
251069
0.848243
open_acc
0
0.000000
pub_rec
0
0.000000
revol_bal
0
0.000000
revol_util
162
0.000547
total_acc
0
0.000000
initial_list_status
0
0.000000
out_prncp
0
0.000000
out_prncp_inv
0
0.000000
total_pymnt
0
0.000000
total_pymnt_inv
0
0.000000
total_rec_prncp
0
0.000000
total_rec_int
0
0.000000
total_rec_late_fee
0
0.000000
recoveries
0
0.000000
collection_recovery_fee
0
0.000000
last_pymnt_d
186
0.000628
last_pymnt_amnt
0
0.000000
next_pymnt_d
38805
0.131104
last_credit_pull_d
25
0.000084
last_fico_range_high
0
0.000000
last_fico_range_low
0
0.000000
collections_12_mths_ex_med
0
0.000000
mths_since_last_major_derog
223698
0.755770
policy_code
0
0.000000
In [79]:
1 >= 2
Out[79]:
False
In [80]:
df.int_rate.dtype
Out[80]:
dtype('float64')
In [103]:
In [127]:
df.sample_df(pct = 0.10).nrow()
Out[127]:
29598
In [10]:
df.factors()
1 loops, best of 3: 804 ms per loop
In [15]:
timeit df.detectkey(pct = 0.05)
1 loops, best of 3: 642 ms per loop
In [16]:
timeit df.detectkey2()
1 loops, best of 3: 3.37 s per loop
In [18]:
df.nearzerovar()
freq_ratio nzv percent_unique zero_var
id 1.000000 False 100.000000 False
member_id 1.000000 False 100.000000 False
loan_amnt 1.319625 False 0.446303 False
funded_amnt 1.319625 False 0.446303 False
funded_amnt_inv 1.350674 False 0.457115 False
term 2.556126 False 0.000676 False
int_rate 1.049433 False 0.033785 False
installment 1.211738 False 13.732698 False
grade 1.042449 False 0.002365 False
sub_grade 1.019850 False 0.011825 False
emp_title 1.253420 False 44.487765 False
emp_length 3.964490 False 0.004054 False
home_ownership 1.358463 False 0.001351 False
annual_inc 1.116673 False 7.601347 False
is_inc_v 1.177140 False 0.001014 False
issue_d 1.534426 False 0.007095 False
loan_status 7.741702 False 0.002365 False
pymnt_plan 3362.488636 True 0.000676 False
url 1.000000 False 100.000000 False
desc 1.100000 False 21.210053 False
purpose 2.482576 False 0.004392 False
title 2.558264 False 11.264684 False
zip_code 1.028125 False 0.292243 False
addr_state 1.795696 False 0.016893 False
dti 1.145299 False 1.321680 False
delinq_2yrs 6.479383 False 0.007771 False
earliest_cr_line 1.011780 False 0.219604 False
fico_range_low 1.025028 False 0.012838 False
fico_range_high 1.025028 False 0.012838 False
inq_last_6mths 1.814355 False 0.002365 False
mths_since_last_delinq 1.009972 False 0.047637 False
mths_since_last_record 1.012972 False 0.041218 False
open_acc 1.021711 False 0.020271 False
pub_rec 6.443615 False 0.007095 False
revol_bal 21.151515 False 17.034870 False
revol_util 1.272727 False 0.385490 False
total_acc 1.001182 False 0.034461 False
initial_list_status 1.532769 False 0.000676 False
out_prncp 337.547826 False 49.844419 False
out_prncp_inv 340.508772 False 52.214793 False
total_pymnt 1.447674 False 57.532256 False
total_pymnt_inv 1.364198 False 59.522547 False
total_rec_prncp 1.229989 False 48.328136 False
total_rec_int 1.512195 False 49.332910 False
total_rec_late_fee 856.357558 True 0.307446 False
recoveries 58780.200000 True 0.688206 False
collection_recovery_fee 98034.666667 True 0.630095 False
last_pymnt_d 6.877148 False 0.007771 False
last_pymnt_amnt 1.228931 False 23.844290 False
next_pymnt_d 40.179052 True 0.001689 False
last_credit_pull_d 59.006795 True 0.008784 False
last_fico_range_high 1.028124 False 0.024325 False
last_fico_range_low 1.028124 False 0.023988 False
collections_12_mths_ex_med 117.000399 True 0.002365 False
mths_since_last_major_derog 1.013688 False 0.054732 False
policy_code 1.000000 False 0.000338 True
Out[18]:
Index([u'pymnt_plan', u'total_rec_late_fee', u'recoveries', u'collection_recovery_fee', u'next_pymnt_d', u'last_credit_pull_d', u'collections_12_mths_ex_med'], dtype='object')
In [39]:
def pandas_to_ndarray_wrap(X, copy=True):
"""
Converts X to a ndarray and provides a function to help convert back
to pandas object.
Parameters
----------
X : Series/DataFrame/ndarray
copy : Boolean
If True, return a copy.
Returns
-------
Xvals : ndarray
If X is a Series/DataFrame, then Xvals = X.values,
if ndarray, Xvals = X
F : Function
F(Xvals) = X
"""
if copy:
X = X.copy()
if isinstance(X, pd.Series):
return X.values, lambda Z: pd.Series(np.squeeze(Z), index=X.index)
elif isinstance(X, pd.DataFrame):
return X.values, lambda Z: pd.DataFrame(
Z, index=X.index, columns=X.columns)
elif isinstance(X, np.ndarray) or isspmatrix(X):
return X, lambda Z: Z
else:
raise ValueError("Unhandled type: %s" % type(X))
In [40]:
pandas_to_ndarray_wrap(df)
Out[40]:
(array([[10159548, 12011167.0, 15000.0, ..., 10159548, nan, 0],
[10149488, 12001033.0, 4800.0, ..., 10149488, 1.0, 0],
[10129403, 11981032.0, 7550.0, ..., 10129403, 1.0, 0],
...,
['1059224', 1290827.0, 35000.0, ..., '1059224', nan, 0],
['1058722', 1290521.0, 12000.0, ..., '1058722', nan, 1],
['1058291', 1289878.0, 12000.0, ..., '1058291', nan, 0]], dtype=object),
<function __main__.<lambda>>)
In [41]:
pandas_to_ndarray_wrap(df)[0]
Out[41]:
array([[10159548, 12011167.0, 15000.0, ..., 10159548, nan, 0],
[10149488, 12001033.0, 4800.0, ..., 10149488, 1.0, 0],
[10129403, 11981032.0, 7550.0, ..., 10129403, 1.0, 0],
...,
['1059224', 1290827.0, 35000.0, ..., '1059224', nan, 0],
['1058722', 1290521.0, 12000.0, ..., '1058722', nan, 1],
['1058291', 1289878.0, 12000.0, ..., '1058291', nan, 0]], dtype=object)
In [44]:
df.size
Out[44]:
10723011
In [45]:
timeit df.duplicated()
1 loops, best of 3: 1.08 s per loop
In [ ]:
Content source: ericfourrier/pandas-patch
Similar notebooks: